JASON STOPAS
MODULE 04: LAB 01
In [1]:
import gdown
import polars as pl
import pandas as pd
import numpy as np
import sqlite3
import plotly.express as px
import plotly.graph_objects as go
In [2]:
# Download the Lightcast job-postings CSV (~717 MB per the log below) from
# Google Drive, then load it into a polars DataFrame.
# NOTE(review): re-running this cell always re-downloads the full file —
# consider guarding with an os.path.exists() check on download_LC.
drive_loc = 'https://drive.google.com/uc?id=1V2GCHGt2dkFGqVBeoUFckU4IhUgk4ocQ'
download_LC = 'downloaded_file.csv'
gdown.download(drive_loc, download_LC, quiet=False)
Jobs = pl.read_csv(download_LC)
Downloading... From (original): https://drive.google.com/uc?id=1V2GCHGt2dkFGqVBeoUFckU4IhUgk4ocQ From (redirected): https://drive.google.com/uc?id=1V2GCHGt2dkFGqVBeoUFckU4IhUgk4ocQ&confirm=t&uuid=d2c62f77-a3a3-4c81-bb4c-ea5c60293206 To: C:\Users\jtsto\OneDrive\Documents\03 BU SPRING 2025\AD 688\Mod04_Lab01\downloaded_file.csv 100%|██████████| 717M/717M [01:06<00:00, 10.8MB/s]
In [3]:
# Preview the first five rows of the raw postings frame (131 columns).
Jobs.head()
Out[3]:
shape: (5, 131)
| ID | LAST_UPDATED_DATE | LAST_UPDATED_TIMESTAMP | DUPLICATES | POSTED | EXPIRED | DURATION | SOURCE_TYPES | SOURCES | URL | ACTIVE_URLS | ACTIVE_SOURCES_INFO | TITLE_RAW | BODY | MODELED_EXPIRED | MODELED_DURATION | COMPANY | COMPANY_NAME | COMPANY_RAW | COMPANY_IS_STAFFING | EDUCATION_LEVELS | EDUCATION_LEVELS_NAME | MIN_EDULEVELS | MIN_EDULEVELS_NAME | MAX_EDULEVELS | MAX_EDULEVELS_NAME | EMPLOYMENT_TYPE | EMPLOYMENT_TYPE_NAME | MIN_YEARS_EXPERIENCE | MAX_YEARS_EXPERIENCE | IS_INTERNSHIP | SALARY | REMOTE_TYPE | REMOTE_TYPE_NAME | ORIGINAL_PAY_PERIOD | SALARY_TO | SALARY_FROM | … | SOC_2021_5_NAME | LOT_CAREER_AREA | LOT_CAREER_AREA_NAME | LOT_OCCUPATION | LOT_OCCUPATION_NAME | LOT_SPECIALIZED_OCCUPATION | LOT_SPECIALIZED_OCCUPATION_NAME | LOT_OCCUPATION_GROUP | LOT_OCCUPATION_GROUP_NAME | LOT_V6_SPECIALIZED_OCCUPATION | LOT_V6_SPECIALIZED_OCCUPATION_NAME | LOT_V6_OCCUPATION | LOT_V6_OCCUPATION_NAME | LOT_V6_OCCUPATION_GROUP | LOT_V6_OCCUPATION_GROUP_NAME | LOT_V6_CAREER_AREA | LOT_V6_CAREER_AREA_NAME | SOC_2 | SOC_2_NAME | SOC_3 | SOC_3_NAME | SOC_4 | SOC_4_NAME | SOC_5 | SOC_5_NAME | LIGHTCAST_SECTORS | LIGHTCAST_SECTORS_NAME | NAICS_2022_2 | NAICS_2022_2_NAME | NAICS_2022_3 | NAICS_2022_3_NAME | NAICS_2022_4 | NAICS_2022_4_NAME | NAICS_2022_5 | NAICS_2022_5_NAME | NAICS_2022_6 | NAICS_2022_6_NAME |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| str | str | str | i64 | str | str | i64 | str | str | str | str | str | str | str | str | i64 | i64 | str | str | bool | str | str | i64 | str | i64 | str | i64 | str | i64 | i64 | bool | i64 | i64 | str | str | i64 | i64 | … | str | i64 | str | i64 | str | i64 | str | i64 | str | i64 | str | i64 | str | i64 | str | i64 | str | str | str | str | str | str | str | str | str | str | str | i64 | str | i64 | str | i64 | str | i64 | str | i64 | str |
| "1f57d95acf4dc67ed2819eb12f049f… | "9/6/2024" | "2024-09-06 20:32:57.352 Z" | 0 | "6/2/2024" | "6/8/2024" | 6 | "[ "Company" ]" | "[ "brassring.com" ]" | "[ "https://sjobs.brassring.c… | "[]" | null | "Enterprise Analyst (II-III)" | "31-May-2024 Enterprise Analys… | "6/8/2024" | 6 | 894731 | "Murphy USA" | "Murphy USA" | false | "[ 2 ]" | "[ "Bachelor's degree" ]" | 2 | "Bachelor's degree" | null | null | 1 | "Full-time (> 32 hours)" | 2 | 2 | false | null | 0 | "[None]" | null | null | null | … | "Data Scientists" | 23 | "Information Technology and Com… | 231010 | "Business Intelligence Analyst" | 23101011 | "General ERP Analyst / Consulta… | 2310 | "Business Intelligence" | 23101011 | "General ERP Analyst / Consulta… | 231010 | "Business Intelligence Analyst" | 2310 | "Business Intelligence" | 23 | "Information Technology and Com… | "15-0000" | "Computer and Mathematical Occu… | "15-2000" | "Mathematical Science Occupatio… | "15-2050" | "Data Scientists" | "15-2051" | "Data Scientists" | "[ 7 ]" | "[ "Artificial Intelligence" … | 44 | "Retail Trade" | 441 | "Motor Vehicle and Parts Dealer… | 4413 | "Automotive Parts, Accessories,… | 44133 | "Automotive Parts and Accessori… | 441330 | "Automotive Parts and Accessori… |
| "0cb072af26757b6c4ea9464472a50a… | "8/2/2024" | "2024-08-02 17:08:58.838 Z" | 0 | "6/2/2024" | "8/1/2024" | null | "[ "Job Board" ]" | "[ "maine.gov" ]" | "[ "https://joblink.maine.gov… | "[]" | null | "Oracle Consultant - Reports (3… | "Oracle Consultant - Reports (3… | "8/1/2024" | null | 133098 | "Smx Corporation Limited" | "SMX" | true | "[ 99 ]" | "[ "No Education Listed" ]" | 99 | "No Education Listed" | null | null | 1 | "Full-time (> 32 hours)" | 3 | 3 | false | null | 1 | "Remote" | null | null | null | … | "Data Scientists" | 23 | "Information Technology and Com… | 231010 | "Business Intelligence Analyst" | 23101012 | "Oracle Consultant / Analyst" | 2310 | "Business Intelligence" | 23101012 | "Oracle Consultant / Analyst" | 231010 | "Business Intelligence Analyst" | 2310 | "Business Intelligence" | 23 | "Information Technology and Com… | "15-0000" | "Computer and Mathematical Occu… | "15-2000" | "Mathematical Science Occupatio… | "15-2050" | "Data Scientists" | "15-2051" | "Data Scientists" | null | null | 56 | "Administrative and Support and… | 561 | "Administrative and Support Ser… | 5613 | "Employment Services" | 56132 | "Temporary Help Services" | 561320 | "Temporary Help Services" |
| "85318b12b3331fa490d32ad014379d… | "9/6/2024" | "2024-09-06 20:32:57.352 Z" | 1 | "6/2/2024" | "7/7/2024" | 35 | "[ "Job Board" ]" | "[ "dejobs.org" ]" | "[ "https://dejobs.org/dallas… | "[]" | null | "Data Analyst" | "Taking care of people is at th… | "6/10/2024" | 8 | 39063746 | "Sedgwick" | "Sedgwick" | false | "[ 2 ]" | "[ "Bachelor's degree" ]" | 2 | "Bachelor's degree" | null | null | 1 | "Full-time (> 32 hours)" | 5 | null | false | null | 0 | "[None]" | null | null | null | … | "Data Scientists" | 23 | "Information Technology and Com… | 231113 | "Data / Data Mining Analyst" | 23111310 | "Data Analyst" | 2311 | "Data Analysis and Mathematics" | 23111310 | "Data Analyst" | 231113 | "Data / Data Mining Analyst" | 2311 | "Data Analysis and Mathematics" | 23 | "Information Technology and Com… | "15-0000" | "Computer and Mathematical Occu… | "15-2000" | "Mathematical Science Occupatio… | "15-2050" | "Data Scientists" | "15-2051" | "Data Scientists" | null | null | 52 | "Finance and Insurance" | 524 | "Insurance Carriers and Related… | 5242 | "Agencies, Brokerages, and Othe… | 52429 | "Other Insurance Related Activi… | 524291 | "Claims Adjusting" |
| "1b5c3941e54a1889ef4f8ae55b401a… | "9/6/2024" | "2024-09-06 20:32:57.352 Z" | 1 | "6/2/2024" | "7/20/2024" | 48 | "[ "Job Board" ]" | "[ "disabledperson.com", "d… | "[ "https://www.disabledperso… | "[]" | null | "Sr. Lead Data Mgmt. Analyst - … | "About this role: Wells Fargo … | "6/12/2024" | 10 | 37615159 | "Wells Fargo" | "Wells Fargo" | false | "[ 99 ]" | "[ "No Education Listed" ]" | 99 | "No Education Listed" | null | null | 1 | "Full-time (> 32 hours)" | 3 | null | false | null | 0 | "[None]" | null | null | null | … | "Data Scientists" | 23 | "Information Technology and Com… | 231113 | "Data / Data Mining Analyst" | 23111310 | "Data Analyst" | 2311 | "Data Analysis and Mathematics" | 23111310 | "Data Analyst" | 231113 | "Data / Data Mining Analyst" | 2311 | "Data Analysis and Mathematics" | 23 | "Information Technology and Com… | "15-0000" | "Computer and Mathematical Occu… | "15-2000" | "Mathematical Science Occupatio… | "15-2050" | "Data Scientists" | "15-2051" | "Data Scientists" | "[ 6 ]" | "[ "Data Privacy/Protection" … | 52 | "Finance and Insurance" | 522 | "Credit Intermediation and Rela… | 5221 | "Depository Credit Intermediati… | 52211 | "Commercial Banking" | 522110 | "Commercial Banking" |
| "cb5ca25f02bdf25c13edfede793150… | "6/19/2024" | "2024-06-19 07:00:00.000 Z" | 0 | "6/2/2024" | "6/17/2024" | 15 | "[ "FreeJobBoard" ]" | "[ "craigslist.org" ]" | "[ "https://modesto.craigslis… | "[]" | null | "Comisiones de $1000 - $3000 po… | "Comisiones de $1000 - $3000 po… | "6/17/2024" | 15 | 0 | "Unclassified" | "LH/GM" | false | "[ 99 ]" | "[ "No Education Listed" ]" | 99 | "No Education Listed" | null | null | 3 | "Part-time / full-time" | null | null | false | 92500 | 0 | "[None]" | "year" | 150000 | 35000 | … | "Data Scientists" | 23 | "Information Technology and Com… | 231010 | "Business Intelligence Analyst" | 23101012 | "Oracle Consultant / Analyst" | 2310 | "Business Intelligence" | 23101012 | "Oracle Consultant / Analyst" | 231010 | "Business Intelligence Analyst" | 2310 | "Business Intelligence" | 23 | "Information Technology and Com… | "15-0000" | "Computer and Mathematical Occu… | "15-2000" | "Mathematical Science Occupatio… | "15-2050" | "Data Scientists" | "15-2051" | "Data Scientists" | null | null | 99 | "Unclassified Industry" | 999 | "Unclassified Industry" | 9999 | "Unclassified Industry" | 99999 | "Unclassified Industry" | 999999 | "Unclassified Industry" |
In [4]:
# Dataset dimensions as (rows, columns).
Jobs.shape
Out[4]:
(72498, 131)
In [5]:
# List every column name in the postings frame, one per line.
print("\n".join(Jobs.columns))
ID LAST_UPDATED_DATE LAST_UPDATED_TIMESTAMP DUPLICATES POSTED EXPIRED DURATION SOURCE_TYPES SOURCES URL ACTIVE_URLS ACTIVE_SOURCES_INFO TITLE_RAW BODY MODELED_EXPIRED MODELED_DURATION COMPANY COMPANY_NAME COMPANY_RAW COMPANY_IS_STAFFING EDUCATION_LEVELS EDUCATION_LEVELS_NAME MIN_EDULEVELS MIN_EDULEVELS_NAME MAX_EDULEVELS MAX_EDULEVELS_NAME EMPLOYMENT_TYPE EMPLOYMENT_TYPE_NAME MIN_YEARS_EXPERIENCE MAX_YEARS_EXPERIENCE IS_INTERNSHIP SALARY REMOTE_TYPE REMOTE_TYPE_NAME ORIGINAL_PAY_PERIOD SALARY_TO SALARY_FROM LOCATION CITY CITY_NAME COUNTY COUNTY_NAME MSA MSA_NAME STATE STATE_NAME COUNTY_OUTGOING COUNTY_NAME_OUTGOING COUNTY_INCOMING COUNTY_NAME_INCOMING MSA_OUTGOING MSA_NAME_OUTGOING MSA_INCOMING MSA_NAME_INCOMING NAICS2 NAICS2_NAME NAICS3 NAICS3_NAME NAICS4 NAICS4_NAME NAICS5 NAICS5_NAME NAICS6 NAICS6_NAME TITLE TITLE_NAME TITLE_CLEAN SKILLS SKILLS_NAME SPECIALIZED_SKILLS SPECIALIZED_SKILLS_NAME CERTIFICATIONS CERTIFICATIONS_NAME COMMON_SKILLS COMMON_SKILLS_NAME SOFTWARE_SKILLS SOFTWARE_SKILLS_NAME ONET ONET_NAME ONET_2019 ONET_2019_NAME CIP6 CIP6_NAME CIP4 CIP4_NAME CIP2 CIP2_NAME SOC_2021_2 SOC_2021_2_NAME SOC_2021_3 SOC_2021_3_NAME SOC_2021_4 SOC_2021_4_NAME SOC_2021_5 SOC_2021_5_NAME LOT_CAREER_AREA LOT_CAREER_AREA_NAME LOT_OCCUPATION LOT_OCCUPATION_NAME LOT_SPECIALIZED_OCCUPATION LOT_SPECIALIZED_OCCUPATION_NAME LOT_OCCUPATION_GROUP LOT_OCCUPATION_GROUP_NAME LOT_V6_SPECIALIZED_OCCUPATION LOT_V6_SPECIALIZED_OCCUPATION_NAME LOT_V6_OCCUPATION LOT_V6_OCCUPATION_NAME LOT_V6_OCCUPATION_GROUP LOT_V6_OCCUPATION_GROUP_NAME LOT_V6_CAREER_AREA LOT_V6_CAREER_AREA_NAME SOC_2 SOC_2_NAME SOC_3 SOC_3_NAME SOC_4 SOC_4_NAME SOC_5 SOC_5_NAME LIGHTCAST_SECTORS LIGHTCAST_SECTORS_NAME NAICS_2022_2 NAICS_2022_2_NAME NAICS_2022_3 NAICS_2022_3_NAME NAICS_2022_4 NAICS_2022_4_NAME NAICS_2022_5 NAICS_2022_5_NAME NAICS_2022_6 NAICS_2022_6_NAME
Filter the postings down to those that include salary information.
In [6]:
# Keep only postings where all three salary fields are populated.
# Rewritten with the polars expression API (pl.col(...).is_not_null())
# instead of materialized Series masks: equivalent by De Morgan's law
# (~(a_null | b_null | c_null) == a_not_null & b_not_null & c_not_null),
# idiomatic, and also usable in lazy mode.
Jobs_w_salary_info = Jobs.filter(
    pl.col("SALARY").is_not_null()
    & pl.col("SALARY_TO").is_not_null()
    & pl.col("SALARY_FROM").is_not_null()
)
In [7]:
# Summary statistics for the three salary columns of the filtered frame.
Jobs_w_salary_info.select(["SALARY_FROM", "SALARY_TO", "SALARY"]).describe()
Out[7]:
shape: (9, 4)
| statistic | SALARY_FROM | SALARY_TO | SALARY |
|---|---|---|---|
| str | f64 | f64 | f64 |
| "count" | 30808.0 | 30808.0 | 30808.0 |
| "null_count" | 0.0 | 0.0 | 0.0 |
| "mean" | 96108.560277 | 139349.489938 | 117953.755031 |
| "std" | 38651.515935 | 57867.606046 | 45133.878359 |
| "min" | 10230.0 | 16640.0 | 15860.0 |
| "25%" | 66560.0 | 96471.0 | 84933.0 |
| "50%" | 90000.0 | 135000.0 | 116300.0 |
| "75%" | 119800.0 | 178200.0 | 145600.0 |
| "max" | 500000.0 | 500000.0 | 500000.0 |
In [8]:
# Row/column count after the salary filter (30,808 of 72,498 rows remain).
Jobs_w_salary_info.shape
Out[8]:
(30808, 131)
In [9]:
# Frequency of each employment-type label among salary-bearing postings.
Jobs_w_salary_info.select(
    pl.col("EMPLOYMENT_TYPE_NAME").value_counts()
)
Out[9]:
shape: (3, 1)
| EMPLOYMENT_TYPE_NAME |
|---|
| struct[2] |
| {"Part-time (≤ 32 hours)",1038} |
| {"Full-time (> 32 hours)",29151} |
| {"Part-time / full-time",619} |
In [10]:
# Same distribution using the numeric employment-type code; the counts in
# Out[9]/Out[10] show 1 = full-time, 2 = part-time, 3 = part-time/full-time.
Jobs_w_salary_info.select(
    pl.col("EMPLOYMENT_TYPE").value_counts()
)
Out[10]:
shape: (3, 1)
| EMPLOYMENT_TYPE |
|---|
| struct[2] |
| {1,29151} |
| {2,1038} |
| {3,619} |
In [11]:
# Salary summary statistics per employment type, highest average first.
salary = pl.col("SALARY")
Jobs_w_salary_info.group_by("EMPLOYMENT_TYPE_NAME").agg([
    pl.len().alias("Job_Count"),
    salary.mean().alias("Avg_Salary"),
    salary.median().alias("Median_Salary"),
    salary.min().alias("Min_Salary"),
    salary.max().alias("Max_Salary"),
    salary.std().alias("Salary_StdDev"),
]).sort("Avg_Salary", descending=True)
Out[11]:
shape: (3, 7)
| EMPLOYMENT_TYPE_NAME | Job_Count | Avg_Salary | Median_Salary | Min_Salary | Max_Salary | Salary_StdDev |
|---|---|---|---|---|---|---|
| str | u32 | f64 | f64 | i64 | i64 | f64 |
| "Full-time (> 32 hours)" | 29151 | 118897.558609 | 116500.0 | 20583 | 500000 | 44351.533443 |
| "Part-time / full-time" | 619 | 105621.242326 | 100000.0 | 20800 | 455375 | 52979.422642 |
| "Part-time (≤ 32 hours)" | 1038 | 98802.509634 | 86390.0 | 15860 | 310050 | 55382.720356 |
In [12]:
# Starting-salary distribution by employment type: one grey box per type,
# with every observation overlaid as a colour-scaled scatter point.
df_plot = Jobs_w_salary_info.select(["EMPLOYMENT_TYPE_NAME", "SALARY_FROM"]).to_pandas()

# Build one box trace per employment type, in order of first appearance.
box_traces = []
for emp_type in df_plot["EMPLOYMENT_TYPE_NAME"].unique():
    salaries = df_plot.loc[df_plot["EMPLOYMENT_TYPE_NAME"] == emp_type, "SALARY_FROM"]
    box_traces.append(
        go.Box(
            y=salaries,
            name=emp_type,
            boxpoints='outliers',
            marker_color='lightgray',
            line_color='black',
            showlegend=False
        )
    )

# Overlay all raw observations, coloured by salary value.
point_trace = go.Scatter(
    x=df_plot["EMPLOYMENT_TYPE_NAME"],
    y=df_plot["SALARY_FROM"],
    mode='markers',
    marker=dict(
        color=df_plot["SALARY_FROM"],
        colorscale='Viridis',
        showscale=True,
        size=6,
        opacity=0.7,
        colorbar=dict(title="Starting Salary")
    ),
    name='Salary Points',
    hoverinfo='x+y'
)

plot_01 = go.Figure(data=box_traces + [point_trace])
plot_01.update_layout(
    title="Starting Salary Distribution by Employment Type",
    xaxis_title="Employment Type",
    yaxis_title="Starting Salary",
    xaxis_tickangle=-45,
    template='plotly_white',
    yaxis=dict(
        tick0=0,
        dtick=50000,
        gridcolor="lightgray"
    ),
    margin=dict(t=80, b=100, l=80, r=40),
    boxmode='group',
    boxgap=0.4,
    height=600,
    width=1000
)
plot_01.show()
In [13]:
# Starting-salary distribution per 2-digit NAICS industry: grey boxes plus
# a colour-scaled scatter overlay of every observation.
# NOTE(review): near-duplicate of the employment-type plot cell — consider
# extracting a shared plotting helper. Also reuses (overwrites) df_plot
# from that cell, which makes out-of-order re-runs fragile.
df_plot = Jobs_w_salary_info.select(["NAICS2_NAME", "SALARY_FROM"]).to_pandas()
industries = df_plot["NAICS2_NAME"].unique()
plot_02 = go.Figure()
# One box trace per industry, in order of first appearance.
for industry in industries:
    plot_02.add_trace(
        go.Box(
            y=df_plot[df_plot["NAICS2_NAME"] == industry]["SALARY_FROM"],
            name=industry,
            boxpoints='outliers',
            marker_color='lightgray',
            line_color='black',
            showlegend=False
        )
    )
# Overlay all raw points, coloured by salary value.
plot_02.add_trace(
    go.Scatter(
        x=df_plot["NAICS2_NAME"],
        y=df_plot["SALARY_FROM"],
        mode='markers',
        marker=dict(
            color=df_plot["SALARY_FROM"],
            colorscale='Turbo',
            showscale=True,
            size=6,
            opacity=0.7,
            colorbar=dict(title="Starting Salary")
        ),
        name='Salary Points',
        hoverinfo='x+y'
    )
)
plot_02.update_layout(
    title="Starting Salary Distribution by Industry (NAICS2_NAME)",
    xaxis_title="Industry (NAICS2)",
    yaxis_title="Starting Salary",
    xaxis_tickangle=-45,
    template='plotly_white',
    yaxis=dict(
        tick0=0,
        dtick=50000,
        gridcolor="lightgray"
    ),
    margin=dict(
        t=80,
        b=120,
        l=80,
        r=40
    ),
    boxmode='group',
    boxgap=0.4,
    height=700,
    width=1200
)
plot_02.show()
In [14]:
# Parse POSTED ("m/d/Y" strings, e.g. "6/2/2024") into a proper Date
# column; unparseable values become null because strict=False.
posted_as_date = (
    pl.col("POSTED")
    .cast(pl.Utf8)
    .str.strptime(pl.Date, "%m/%d/%Y", strict=False)
    .alias("POSTED_DATE")
)
Jobs_with_dates = Jobs.with_columns(posted_as_date)

# Number of postings per calendar day, in chronological order.
daily_postings = (
    Jobs_with_dates
    .group_by("POSTED_DATE")
    .agg(pl.len().alias("Post_Count"))
    .sort("POSTED_DATE")
)

# Centered 7-day rolling mean for the trend line (NaN at both ends,
# where fewer than 7 days are available).
df_line = daily_postings.to_pandas()
df_line["Smoothed"] = df_line["Post_Count"].rolling(window=7, center=True).mean()
In [15]:
# Time series of daily posting counts: raw counts in grey, 7-day
# centered rolling mean in red.
raw_trace = go.Scatter(
    x=df_line["POSTED_DATE"],
    y=df_line["Post_Count"],
    mode="lines",
    name="Raw Daily Count",
    line=dict(color="lightgray", width=2)
)
smoothed_trace = go.Scatter(
    x=df_line["POSTED_DATE"],
    y=df_line["Smoothed"],
    mode="lines",
    name="7-Day Smoothed",
    line=dict(color="red", width=3)
)
plot_03 = go.Figure(data=[raw_trace, smoothed_trace])
plot_03.update_layout(
    title="Job Postings Over Time (Raw + 7-Day Smoothed)",
    xaxis_title="Date",
    yaxis_title="Number of Postings",
    template="plotly_white",
    height=500,
    width=1000,
    xaxis_tickangle=-45,
    margin=dict(t=60, b=100, l=60, r=40),
    legend=dict(x=0.01, y=0.99, bordercolor="gray", borderwidth=1)
)
plot_03.show()
In [16]:
# Date coverage of the dataset (2024-05-01 through 2024-09-30 per Out[16]).
Jobs_with_dates.select([
    pl.col("POSTED_DATE").min().alias("Earliest"),
    pl.col("POSTED_DATE").max().alias("Latest")
])
Out[16]:
shape: (1, 2)
| Earliest | Latest |
|---|---|
| date | date |
| 2024-05-01 | 2024-09-30 |
In [17]:
# The ten most frequently posted job titles across the full dataset.
top_titles = (
    Jobs
    .group_by("TITLE_NAME")
    .agg(pl.len().alias("Count"))
    .sort("Count", descending=True)
    .head(10)
)
In [18]:
# Convert to pandas for plotting with plotly express.
df_top_titles = top_titles.to_pandas()
In [19]:
# Bar chart of the ten most common job titles, one colour per title.
bar_spec = dict(
    x="TITLE_NAME",
    y="Count",
    color="TITLE_NAME",  # distinct colour per title
    title="Top 10 Job Titles by Number of Postings",
    labels={"TITLE_NAME": "Job Title", "Count": "Number of Postings"},
    template="plotly_white",
)
plot_04 = px.bar(df_top_titles, **bar_spec)
plot_04.update_layout(
    xaxis_tickangle=-45,
    height=500,
    width=1000,
    showlegend=False,  # x-axis labels already name each title
    margin=dict(t=60, b=100, l=60, r=40)
)
plot_04.show()
In [20]:
# Inspect distinct remote-type labels; Out[20] shows null and the literal
# string "[None]" both appear and need cleaning.
Jobs.select("REMOTE_TYPE_NAME").unique()
Out[20]:
shape: (5, 1)
| REMOTE_TYPE_NAME |
|---|
| str |
| "Not Remote" |
| "Hybrid Remote" |
| null |
| "Remote" |
| "[None]" |
In [21]:
# Normalize REMOTE_TYPE_NAME: null and the placeholder string "[None]"
# both become "Unknown"; real labels pass through unchanged.
remote_raw = pl.col("REMOTE_TYPE_NAME")
# strip_chars removes surrounding brackets, so "[None]" compares as "None".
is_placeholder = remote_raw.is_null() | (
    remote_raw.cast(pl.Utf8).str.strip_chars("[]") == "None"
)
Jobs_cleaned_remote = Jobs.with_columns(
    pl.when(is_placeholder)
    .then(pl.lit("Unknown"))
    .otherwise(remote_raw)
    .alias("REMOTE_TYPE_NAME")
)
In [22]:
# Verify the cleanup: only four categories remain, none null.
Jobs_cleaned_remote.select("REMOTE_TYPE_NAME").unique()
Out[22]:
shape: (4, 1)
| REMOTE_TYPE_NAME |
|---|
| str |
| "Not Remote" |
| "Remote" |
| "Unknown" |
| "Hybrid Remote" |
In [23]:
# Posting count per (cleaned) remote-work category, largest first.
remote_counts = Jobs_cleaned_remote.group_by("REMOTE_TYPE_NAME").agg(
    pl.len().alias("Count")
).sort("Count", descending=True)
In [24]:
# Pie chart of the remote-work category distribution.
df_remote = remote_counts.to_pandas()
pie_spec = dict(
    names="REMOTE_TYPE_NAME",
    values="Count",
    title="Distribution of Remote Work Types",
    color_discrete_sequence=px.colors.qualitative.Set3,
)
plot_05 = px.pie(df_remote, **pie_spec)
plot_05.update_traces(textposition='inside', textinfo='percent+label')
plot_05.update_layout(
    template="plotly_white",
    margin=dict(t=60, b=60, l=60, r=60)
)
plot_05.show()
In [25]:
# Count distinct SKILLS_NAME strings (each is a whole serialized list,
# not an individual skill).
Jobs.select("SKILLS_NAME").unique().shape
Out[25]:
(44173, 1)
In [26]:
# Sample values: SKILLS_NAME is stored as a JSON-style list in a string,
# e.g. '[ "Communication", "Integrity" ]'.
Jobs.select("SKILLS_NAME").unique().head(20)
Out[26]:
shape: (20, 1)
| SKILLS_NAME |
|---|
| str |
| "[ "Communication", "Integr… |
| "[ "Databricks", "Curiosity… |
| "[ "Management", "Metadata … |
| "[ "Business Objectives", "… |
| "[ "Detail Oriented", "Ware… |
| … |
| "[ "Relational Databases", … |
| "[ "Azure DevOps", "Researc… |
| "[ "Information Technology", … |
| "[ "Object-Oriented Programmi… |
| "[ "Merchandising", "Market… |
In [27]:
# Parse the JSON-encoded skill list (e.g. '[ "SQL", "Python" ]') into a
# real List(Utf8) column, then explode to one row per (posting, skill).
# BUG FIX: the original used cast(pl.List(pl.Utf8)), but a str -> list
# cast does not JSON-parse the string, so the column was never split
# into individual skills and the downstream "skill mention" counts were
# effectively posting counts. str.json_decode performs the actual parse.
exploded = Jobs.with_columns(
    pl.col("SKILLS_NAME").str.json_decode(pl.List(pl.Utf8))
).explode("SKILLS_NAME")
In [28]:
# Top 15 industries by total exploded rows. This equals total skill
# mentions only if the previous cell genuinely split SKILLS_NAME into
# one skill per row — verify that the explode produced multiple rows
# per posting before interpreting these counts.
skills_per_industry = (
    exploded
    .group_by("NAICS_2022_6_NAME")
    .agg(pl.len().alias("Skill_Count"))
    .sort("Skill_Count", descending=True)
    .limit(15)
)
In [29]:
# Convert to pandas for plotting with plotly express.
df_industry_skills = skills_per_industry.to_pandas()
In [30]:
# Bar chart of the top 15 industries by total skill mentions.
industry_bar_spec = dict(
    x="NAICS_2022_6_NAME",
    y="Skill_Count",
    title="Top 15 Industries by Skill Mentions",
    labels={
        "NAICS_2022_6_NAME": "Industry",
        "Skill_Count": "Total Skill Mentions",
    },
    template="plotly_white",
    color="NAICS_2022_6_NAME",
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
plot_06 = px.bar(df_industry_skills, **industry_bar_spec)
plot_06.update_layout(
    xaxis_tickangle=-45,
    height=700,
    width=1200,
    margin=dict(t=60, b=140, l=60, r=40),
    showlegend=False
)
plot_06.show()
In [31]:
# Distinct ONET occupation names: only "Business Intelligence Analysts"
# and null appear in this dataset (Out[31]).
Jobs.select("ONET_NAME").unique()
Out[31]:
shape: (2, 1)
| ONET_NAME |
|---|
| str |
| null |
| "Business Intelligence Analysts" |
In [32]:
# Among salary-bearing postings, every row is a BI Analyst (Out[32]).
Jobs_w_salary_info.select("ONET_NAME").unique()
Out[32]:
shape: (1, 1)
| ONET_NAME |
|---|
| str |
| "Business Intelligence Analysts" |
In [33]:
# Median advertised salary for BI Analysts (the only ONET group present
# in the salary-bearing subset).
Jobs_w_salary_info.filter(
    pl.col("ONET_NAME") == "Business Intelligence Analysts"
).select(
    pl.col("SALARY").median().alias("Median_Salary")
)
Out[33]:
shape: (1, 1)
| Median_Salary |
|---|
| f64 |
| 116300.0 |
In [34]:
# Single-bar chart of the BI Analyst median salary. The median is
# computed into a named variable first instead of being nested inside
# the px.bar call.
occupation = "Business Intelligence Analysts"
median_salary = (
    Jobs_w_salary_info
    .filter(pl.col("ONET_NAME") == occupation)
    .select(pl.median("SALARY"))[0, 0]
)
plot_07 = px.bar(
    x=[occupation],
    y=[median_salary],
    labels={"x": "[ONET] Occupation", "y": "Median Salary"},
    title="Median Salary: Business Intelligence Analysts",
    template="plotly_white"
)
plot_07.update_traces(marker_color='teal')
plot_07.show()
In [42]:
# Distinct 2-digit SOC names: one real category plus null (Out[42]).
Jobs.select("SOC_2021_2_NAME").unique()
Out[42]:
shape: (2, 1)
| SOC_2021_2_NAME |
|---|
| str |
| "Computer and Mathematical Occu… |
| null |
In [48]:
# Posting count per 2-digit SOC name: 72,454 classified, 44 null (Out[48]).
Jobs.group_by("SOC_2021_2_NAME").agg(
    pl.len().alias("Post_Count")
).sort("Post_Count", descending=True)
Out[48]:
shape: (2, 2)
| SOC_2021_2_NAME | Post_Count |
|---|---|
| str | u32 |
| "Computer and Mathematical Occu… | 72454 |
| null | 44 |
In [36]:
# Distinct 3-digit SOC names: again one real category plus null (Out[36]).
Jobs.select("SOC_2021_3_NAME").unique()
Out[36]:
shape: (2, 1)
| SOC_2021_3_NAME |
|---|
| str |
| "Mathematical Science Occupatio… |
| null |
In [49]:
# Posting count per 3-digit SOC name — mirrors the 2-digit split exactly.
Jobs.group_by("SOC_2021_3_NAME").agg(
    pl.len().alias("Post_Count")
).sort("Post_Count", descending=True)
Out[49]:
shape: (2, 2)
| SOC_2021_3_NAME | Post_Count |
|---|---|
| str | u32 |
| "Mathematical Science Occupatio… | 72454 |
| null | 44 |
In [37]:
# Posting counts per (2-digit SOC, 3-digit SOC) pair for the Sankey
# diagram. Rows missing either level are dropped up front; filtering
# before grouping yields the same result as filtering the aggregate.
transitions = (
    Jobs
    .filter(
        pl.col("SOC_2021_2_NAME").is_not_null()
        & pl.col("SOC_2021_3_NAME").is_not_null()
    )
    .group_by(["SOC_2021_2_NAME", "SOC_2021_3_NAME"])
    .agg(pl.len().alias("Count"))
)
In [38]:
# Convert to pandas for the plotly Sankey construction below.
df_transitions = transitions.to_pandas()
In [39]:
# Build integer node indices for the Sankey diagram: every distinct
# source (SOC2) and target (SOC3) label gets one slot, in order of
# first appearance across the concatenated label arrays.
all_labels = pd.unique(
    np.concatenate([
        df_transitions["SOC_2021_2_NAME"].to_numpy(),
        df_transitions["SOC_2021_3_NAME"].to_numpy(),
    ])
)
label_to_index = dict(zip(all_labels, range(len(all_labels))))
df_transitions["source"] = df_transitions["SOC_2021_2_NAME"].map(label_to_index)
df_transitions["target"] = df_transitions["SOC_2021_3_NAME"].map(label_to_index)
In [40]:
# Sankey diagram of 2-digit -> 3-digit SOC flows, weighted by posting count.
node_spec = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=list(label_to_index),
    color="lightblue",
)
link_spec = dict(
    source=df_transitions["source"],
    target=df_transitions["target"],
    value=df_transitions["Count"],
)
plot_08 = go.Figure(data=[go.Sankey(node=node_spec, link=link_spec)])
plot_08.update_layout(
    title_text="SOC Transitions: 2-Digit → 3-Digit Level",
    font_size=12,
    height=700,
    width=1000
)
plot_08.show()
In [ ]: